In [2]:
import sys, os
sys.path.insert(0, os.path.join("..", "..", ".."))
We follow Rosser et al. and use a maximum-likelihood approach to find the "best" bandwidths for the time and space kernels.
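Concretely (this is our reading of the `log_likelihood` function defined below, rather than a formula quoted from the paper): a prediction made with candidate bandwidths $(h_s, h_t)$ is scored by the log-likelihood of the events which actually occurred on the following day,

$$\log L(h_s, h_t) = \sum_i \log \hat\lambda(e_i),$$

where $\hat\lambda(e_i)$ is the predicted risk on the edge $e_i$ containing event $i$; larger values indicate a better fit.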
In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.collections
import numpy as np
import shelve
import open_cp.network
import open_cp.geometry
import open_cp.network_hotspot
import open_cp.logger
open_cp.logger.log_to_true_stdout()
In [4]:
import pickle, lzma
with lzma.open("input_old.pic.xz", "rb") as f:
    timed_points = pickle.load(f)
with open("input.graph", "rb") as f:
    graph = open_cp.network.PlanarGraph.from_bytes(f.read())
In [5]:
trainer = open_cp.network_hotspot.Trainer()
trainer.graph = graph
trainer.maximum_edge_length = 20
trainer.data = timed_points
predictor = trainer.compile()
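Setting `maximum_edge_length = 20` causes the trainer to subdivide long edges, so that risk is assigned at roughly a 20 metre resolution; at least, that is our understanding of the `Trainer` class, not something documented in this notebook.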
In [6]:
def log_likelihood(result, network_timed_points):
    logli = 0
    for s, e in zip(network_timed_points.start_keys, network_timed_points.end_keys):
        edge_index, _ = result.graph.find_edge(s, e)
        if result.risks[edge_index] == 0:
            # Avoid log(0): penalise a zero-risk edge as if it carried risk exp(-27.6)
            logli -= 27.6
        else:
            logli += np.log(result.risks[edge_index])
    return logli
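As a quick sanity check on the constant 27.6 (interpreting it as $-\log(10^{-12})$ is our assumption, not something stated in the notebook):
In [ ]:
import numpy as np
# -log(1e-12) is about 27.63, so subtracting 27.6 for a zero-risk edge
# amounts to pretending the edge carried a risk of roughly 1e-12.
print(-np.log(1e-12))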
In [7]:
timed_points.time_range
Out[7]:
In [8]:
tstart = np.datetime64("2013-01-01")
tend = np.datetime64("2013-01-01") + np.timedelta64(180, "D")

def score(predictor):
    out = 0
    risks = dict()
    for day in range(60):
        start = tend + np.timedelta64(1, "D") * day
        end = tend + np.timedelta64(1, "D") * (day + 1)
        result = predictor.predict(cutoff_time=tstart, predict_time=start)
        ntp = predictor.network_timed_points
        # Evaluate the prediction against the events falling in the next day
        mask = (ntp.timestamps > start) & (ntp.timestamps <= end)
        ntp = ntp[mask]
        out += log_likelihood(result, ntp)
        risks[start] = result.risks
    return out, risks
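Here `tend` works out to 2013-06-30, so `score` sums the daily log-likelihoods over 60 consecutive one-day windows starting from that date. Note that each prediction passes `cutoff_time=tstart`, which (as we understand `cutoff_time`) means only events timestamped before 2013-01-01 feed each prediction.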
In [13]:
predictor = open_cp.network_hotspot.FastPredictor(predictor, 2000)
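The `FastPredictor` wrapper caches network searches out to a maximum distance, here 2000 metres: comfortably more than the largest spatial bandwidth (1950m) tried below. (That the second argument is this maximum distance is our understanding of the class, not something documented here.)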
In [14]:
time_lengths = list(range(5,100,5))
space_lengths = list(range(50, 2000, 50))
In [15]:
# Grid search: 39 space bandwidths x 19 time bandwidths = 741 pairs, each scored over 60 days
results = dict()
for sl in space_lengths:
    predictor.kernel = open_cp.network_hotspot.TriangleKernel(sl)
    for tl in time_lengths:
        predictor.time_kernel = open_cp.network_hotspot.ExponentialTimeKernel(tl)
        results[(sl, tl)], _ = score(predictor)
In [16]:
with open("cross_validate.pic", "wb") as f:
pickle.dump(results, f)
We see something a little different from the paper of Rosser et al. This might, of course, simply be a genuine difference in the data: Rosser et al. use data from London, UK, while ours is from Chicago.
In [10]:
time_lengths = list(range(5,100,5))
space_lengths = list(range(50, 2000, 50))
with open("cross_validate.pic", "rb") as f:
results = pickle.load(f)
In [17]:
data = np.empty((39, 19))   # 39 space lengths by 19 time lengths
for i, sl in enumerate(space_lengths):
    for j, tl in enumerate(time_lengths):
        data[i, j] = results[(sl, tl)]

# Mask the worst 25% of scores so the colour scale isn't dominated by poor fits
ordered = data.copy().ravel()
ordered.sort()
cutoff = ordered[int(len(ordered) * 0.25)]
data = np.ma.masked_where(data < cutoff, data)
In [18]:
fig, ax = plt.subplots(figsize=(8,6))
# pcolor expects bin edges, hence one more value than the data grid in each direction
mappable = ax.pcolor(range(5, 105, 5), range(50, 2050, 50), data, cmap="Blues")
ax.set(xlabel="Time (days)", ylabel="Space (meters)")
fig.colorbar(mappable, ax=ax)
None
In [19]:
print(max(results.values()))
# The bandwidth pairs scoring above a threshold, presumably chosen just below the printed maximum
[k for k, v in results.items() if v > -5645]
Out[19]:
In [ ]:
pred = open_cp.network_hotspot.ApproxPredictorCaching(predictor)
time_lengths = list(range(5,100,5))
space_lengths = list(range(50, 2000, 50))
In [ ]:
results = dict()
for sl in space_lengths:
    pred.kernel = open_cp.network_hotspot.TriangleKernel(sl)
    for tl in time_lengths:
        pred.time_kernel = open_cp.network_hotspot.ExponentialTimeKernel(tl)
        key = (sl, tl)
        results[key], _ = score(pred)
In [ ]:
with open("cross_validate_approx.pic", "wb") as f:
pickle.dump(results, f)
Exactly the same methodology as above. We see what we might expect, given that this is an approximation which would only be exact in the absence of loops in the geometry: the maximum likelihood occurs at a slightly shorter space bandwidth and a slightly longer time bandwidth.
In [8]:
time_lengths = list(range(5,100,5))
space_lengths = list(range(50, 2000, 50))
with open("cross_validate_approx.pic", "rb") as f:
results = pickle.load(f)
In [9]:
len(space_lengths), len(time_lengths)
Out[9]:
In [10]:
data = np.empty((39, 19))
for i, sl in enumerate(space_lengths):
    for j, tl in enumerate(time_lengths):
        data[i, j] = results[(sl, tl)]

ordered = data.copy().ravel()
ordered.sort()
cutoff = ordered[int(len(ordered) * 0.25)]
data = np.ma.masked_where(data < cutoff, data)
In [11]:
fig, ax = plt.subplots(figsize=(8,6))
mappable = ax.pcolor(range(5,105,5), range(50,2050,50), data, cmap="Blues")
ax.set(xlabel="Time (days)", ylabel="Space (meters)")
fig.colorbar(mappable, ax=ax)
None
In [12]:
print(max(results.values()))
[k for k, v in results.items() if v > -5758]
Out[12]: